This project analyzes Medicare provider data for the Tampa Bay region to understand service patterns, cost structures, and geopgraphic distributions. We utilize association rule mining, k-means clustering, linear regression, and geospatial visualization to explore relationships within the data. The primary data source is the CMS Medicare Provider Utilization and Payment Data Public Use File. Our goal is to identify key trends and factors influencing healthcare delivery and costs in this specific geographic area.
# Add your library below.
# List of required packages
required_packages <- c(
"tidyverse",
"GGally",
"ggplot2",
"sf",
"dplyr",
"readr",
"arules",
"leaflet",
"tigris",
"scales",
"htmlwidgets",
"jsonlite",
"purrr",
"stringr",
"lme4",
"broom.mixed",
"tibble",
"grepel"
)
# Install any packages that are not already installed
to_install <- setdiff(required_packages, rownames(installed.packages()))
if (length(to_install) > 0) {
install.packages(to_install, dependencies = TRUE, repos = "https://cloud.r-project.org/")
}
# Load all required packages and check if successfully loaded
for (pkg in required_packages) {
if (!require(pkg, character.only = TRUE)) {
warning(sprintf("Package '%s' could not be loaded.", pkg))
}
}
# Enable caching for tigris package
options(tigris_use_cache = TRUE)
# Tampa Bay ZIP codes
tampa_bay_zips <- as.numeric(c(
"34423","34428","34429","34430","34431","34432","34433","34434","34436","34442","34445",
"34446","34447","34448","34449","34450","34451","34452","34453","34461","34465","34601","34602","34604","34605",
"34606","34607","34608","34609","34613","34614","33510","33511","33527","33534","33547","33548","33549","33556",
"33558","33559","33563","33565","33566","33567","33569","33570","33572","33573","33578","33579","33584","33586",
"33592","33594","33596","33602","33603","33604","33605","33606","33607","33609","33610","33611","33612","33613",
"33614","33615","33616","33617","33618","33619","33620","33621","33624","33625","33626","33629","33634","33635",
"33637","33647","34201","34202","34203","34205","34207","34208","34209","34210","34211","34212","34215","34216",
"34217","34219","34221","34222","33523","33524","33525","33537","33539","33540","33541","33542","33543","33544",
"33545","33576","34610","34637","34638","34639","34652","34653","34654","34655","34667","34668","34669","34673",
"34674","34679","34680","34690","34691","33701","33702","33703","33704","33705","33706","33707","33708","33709",
"33710","33711","33712","33713","33714","33715","33716","33755","33756","33759","33760","33761","33762","33763",
"33764","33765","33767","33770","33771","33772","33773","33774","33776","33777","33778","33781","33782","33785",
"34677","34681","34683","34684","34685","34688","34689","33801","33803","33805","33809","33810","33811","33812",
"33813","33815","33823","33825","33830","33834","33837","33838","33839","33841","33843","33844","33847","33849",
"33850","33851","33853","33859","33860","33867","33868","33870","33873","33875","33876","33877","33880","33881",
"33882","33884","33896","33898","34231","34232","34233","34234","34235","34236","34237","34238","34239","34240",
"34241","34242","34243","34251","34275","34276","34277","34285","34286","34287","34288","34289","34292","34293",
"34295"
))
# Assigning a datapath for the filtered_data.csv
filtered_data_file_path <- "Data/filtered_data.csv"
# Bypassing the cleaning step if the file exists from previous use
if (!file.exists(filtered_data_file_path)) {
filtered_data <- read_csv("Data/Medicare_dataset.csv", show_col_types = FALSE) %>%
# Amending mistakenly attributed ZIP codes from original dataset
mutate(Rndrng_Prvdr_Zip5 = ifelse(Rndrng_Prvdr_Zip5 %in% c("19107", "19144"), "33308", Rndrng_Prvdr_Zip5)) %>%
filter(grepl("^\\d+$", Rndrng_Prvdr_Zip5)) %>%
mutate(Rndrng_Prvdr_Zip5 = as.numeric(Rndrng_Prvdr_Zip5)) %>%
# Filter by Tampa Bay ZIP codes
filter(Rndrng_Prvdr_Zip5 %in% tampa_bay_zips) %>%
# Rename columns to be more immediately understandable
rename(
NPI = Rndrng_NPI,
Provider_Last_Name = Rndrng_Prvdr_Last_Org_Name,
Provider_First_Name = Rndrng_Prvdr_First_Name,
Provider_MI = Rndrng_Prvdr_MI,
Provider_Credentials = Rndrng_Prvdr_Crdntls,
Entity_Code = Rndrng_Prvdr_Ent_Cd,
Provider_Street1 = Rndrng_Prvdr_St1,
Provider_Street2 = Rndrng_Prvdr_St2,
Provider_City = Rndrng_Prvdr_City,
Provider_State = Rndrng_Prvdr_State_Abrvtn,
FIPS_Code = Rndrng_Prvdr_State_FIPS,
Provider_Zip = Rndrng_Prvdr_Zip5,
RUCA_Code = Rndrng_Prvdr_RUCA,
RUCA_Description = Rndrng_Prvdr_RUCA_Desc,
Provider_Country = Rndrng_Prvdr_Cntry,
Provider_Type = Rndrng_Prvdr_Type,
Medicare_Participation = Rndrng_Prvdr_Mdcr_Prtcptg_Ind,
HCPCS_Code = HCPCS_Cd,
HCPCS_Description = HCPCS_Desc,
HCPCS_Drug_Indicator = HCPCS_Drug_Ind,
Place_Of_Service = Place_Of_Srvc,
Total_Beneficiaries = Tot_Benes,
Total_Services = Tot_Srvcs,
Total_Beneficiary_Service_Days = Tot_Bene_Day_Srvcs,
Avg_Submitted_Charge = Avg_Sbmtd_Chrg,
Avg_Medicare_Allowed = Avg_Mdcr_Alowd_Amt,
Avg_Medicare_Payment = Avg_Mdcr_Pymt_Amt,
Avg_Medicare_Standardized_Amt = Avg_Mdcr_Stdzd_Amt
) %>%
# Removing currency symbol from the following columns
mutate(
across(
.cols = c(Avg_Medicare_Payment, Avg_Medicare_Allowed, Avg_Submitted_Charge, Avg_Medicare_Standardized_Amt),
.fns = ~ gsub("\\$", "", as.character(.))
)
) %>%
select(-Provider_First_Name, -Provider_MI)
# Save the filtered data to a new CSV in the 'data' folder
write.csv(filtered_data, "Data/filtered_data.csv", row.names = FALSE)
# Print confirmation
print(paste("Filtered data saved to: Data/filtered_data.csv"))
} else{
print(paste("File '", filtered_data_file_path, "' already exists. Skipping creation."))
}
## [1] "File ' Data/filtered_data.csv ' already exists. Skipping creation."
Read in the filtered .csv file for easier use without having to load the initial dataset.
#Load the filtered data ONCE after cleaning and take care of problems of FIPS code error and zip map for k-mean map
filtered_data <- read_csv("Data/filtered_data.csv",
col_types = cols(FIPS_Code = col_character())) %>%
mutate(FIPS_Code = as.numeric(gsub("[^0-9]", "", FIPS_Code)), Provider_Zip = sprintf("%05s", as.character(Provider_Zip)))
This project detects potentially suspicious billing behavior among Medicare providers in the Tampa Bay area. Using association rule mining and a composite scoring model, it flags providers who may warrant audit review.
filtered_data <- read.csv("Data/filtered_data.csv")
clean_df <- filtered_data %>%
mutate(
Avg_Medicare_Payment = as.numeric(Avg_Medicare_Payment)
) %>%
filter(
!is.na(Provider_Type),
!is.na(HCPCS_Code),
!is.na(NPI),
as.numeric(Avg_Medicare_Payment) >= 10
) %>%
mutate(
NPI = as.character(NPI),
HCPCS_Code = as.character(HCPCS_Code),
Provider_Type = as.factor(trimws(Provider_Type))
) %>%
select(NPI, HCPCS_Code, Avg_Medicare_Payment, Provider_Type, Provider_Last_Name)
# Initialize lists to store results
all_rule_matches <- tibble() # For NPI-rule match scoring in Step 3
provider_summary_list <- list() # For rule-based provider summaries
# Step 2A: Split the dataset by specialty (Provider_Type)
specialty_groups <- clean_df %>%
group_by(Provider_Type) %>%
group_split()
# Step 2B: Loop through each specialty subset
for (spec_df in specialty_groups) {
spec_name <- as.character(unique(spec_df$Provider_Type))
if (nrow(spec_df) < 2) next
# Step 2C: Convert each NPI into a transaction of billed HCPCS codes
provider_tx <- spec_df %>%
group_by(NPI) %>%
summarise(items = list(unique(HCPCS_Code)), .groups = "drop")
tx_list <- lapply(split(provider_tx$items, provider_tx$NPI), unlist)
if (length(tx_list) < 2) next
trans <- as(tx_list, "transactions")
# Step 2D: Generate association rules for this specialty
rules <- apriori(trans, parameter = list(supp = 0.02, conf = 0.7, maxlen = 2))
if (length(rules) == 0) next
# Step 2E: Filter rules by lift and convert to data frame
rules_df_raw <- as(rules, "data.frame")
rules_df_filtered <- rules_df_raw %>% filter(lift >= 2)
if (nrow(rules_df_filtered) == 0) next
# Add readable labels and parse rule structure for each rule
rules_df_filtered$lhs <- labels(lhs(rules))[as.integer(rownames(rules_df_filtered))]
rules_df_filtered$rhs <- labels(rhs(rules))[as.integer(rownames(rules_df_filtered))]
rules_df_filtered <- rules_df_filtered %>%
mutate(
rule_label = paste(lhs, "=>", rhs),
lhs_items = strsplit(gsub("[{}]", "", lhs), ","),
rhs_items = strsplit(gsub("[{}]", "", rhs), ","),
all_items = map2(lhs_items, rhs_items, ~ unique(trimws(c(.x, .y))))
)
# Step 2F: Match rules to providers
provider_rule_map <- list()
npi_rule_log <- tibble()
for (npi in names(tx_list)) {
provider_items <- tx_list[[npi]]
matched <- rules_df_filtered %>%
filter(map_lgl(all_items, ~ all(.x %in% provider_items)))
if (nrow(matched) > 0) {
provider_rule_map[[npi]] <- matched$rule_label
npi_rule_log <- bind_rows(npi_rule_log, tibble(
NPI = npi,
Rule_Label = matched$rule_label,
Lift = matched$lift,
Confidence = matched$confidence
))
}
}
# Step 2G: Create provider summary stats for rule match counts
if (length(provider_rule_map) > 0) {
provider_summary <- tibble(
Specialty = spec_name,
NPI = names(provider_rule_map),
Total_Rules = sapply(provider_rule_map, function(x) length(unique(x)))
)
hcpcs_diversity <- spec_df %>%
group_by(NPI) %>%
summarise(Unique_Codes = n_distinct(HCPCS_Code), .groups = "drop")
provider_summary <- provider_summary %>%
left_join(hcpcs_diversity, by = "NPI") %>%
mutate(
Rules_Per_Code = round(Total_Rules / Unique_Codes, 2),
Unique_Codes_Per_Rule = round(Unique_Codes / Total_Rules, 4)
)
provider_summary_list[[spec_name]] <- provider_summary
all_rule_matches <- bind_rows(all_rule_matches, npi_rule_log)
}
}
# Export objects for Step 3
final_provider_results <- bind_rows(provider_summary_list)
npi_rule_scores <- all_rule_matches %>%
group_by(NPI) %>%
summarise(
mean_lift = mean(Lift, na.rm = TRUE),
mean_confidence = mean(Confidence, na.rm = TRUE),
.groups = "drop"
)
# Step 3A: Merge rule-based stats into the full cleaned Medicare dataset
merged_df <- clean_df %>%
left_join(final_provider_results, by = "NPI")
# Step 3B: Append average lift and confidence per provider from rule matches
scored_providers <- merged_df %>%
left_join(npi_rule_scores, by = "NPI")
# Step 3C: Z-score standardization for Rules_Per_Code within each specialty
scored_providers <- scored_providers %>%
group_by(Provider_Type) %>%
mutate(Z_Score_RPC = as.numeric(scale(Rules_Per_Code))) %>%
ungroup()
# Step 3D: Compute a composite audit score
# This blends:
# - billing repetition
# - rule lift
# - rule confidence
# - average Medicare payout
# - relative specialty behavior
scored_providers <- scored_providers %>%
mutate(
Composite_Audit_Score = round(
(coalesce(Rules_Per_Code, 0) * 1.5) +
(coalesce(mean_lift, 0) * 2.0) +
(coalesce(mean_confidence, 0) * 1.0) +
(coalesce(as.numeric(Avg_Medicare_Payment, 0) / 100)) +
coalesce(Z_Score_RPC, 0),
2
)
)
# Step 3E: Append sample size per provider for audit traceability
scored_providers <- scored_providers %>%
group_by(NPI) %>%
mutate(Sample_Size = n()) %>%
ungroup()
# Step 4A: Flag audit risk levels using percentile thresholds
# ------------------------------------------------------------
# We classify providers within each specialty based on Composite Score:
# - Top 2% = Extreme High Risk
# - 92–98% = High Risk
# - 80–92% = Moderate Risk
# - Bottom 80% = Low/No Risk
scored_flagged <- scored_providers %>%
group_by(Provider_Type) %>%
mutate(
p80 = quantile(Composite_Audit_Score, 0.80, na.rm = TRUE),
p92 = quantile(Composite_Audit_Score, 0.92, na.rm = TRUE),
p98 = quantile(Composite_Audit_Score, 0.98, na.rm = TRUE),
Audit_Flag = case_when(
Composite_Audit_Score >= p98 ~ "Extreme High Risk (98–100%)",
Composite_Audit_Score >= p92 ~ "High Risk (92–98%)",
Composite_Audit_Score >= p80 ~ "Moderate Risk (80–92%)",
TRUE ~ "Low/No Risk (0–80%)"
)
) %>%
ungroup()
# Step 4B: Filter to valid (non-NA, >0) composite scores
valid_scores_df <- scored_flagged %>%
filter(!is.na(Composite_Audit_Score), Composite_Audit_Score > 0)
# Step 4C: Manually set your "Top 15 Most Interesting" specialties
top15_specialties <- c(
"Anesthesiology", "Cardiology", "Dermatology", "Diagnostic Radiology",
"Emergency Medicine", "Family Practice", "Gastroenterology", "General Surgery",
"Internal Medicine", "Interventional Cardiology", "Nurse Practitioner",
"Orthopedic Surgery", "Physician Assistant", "Podiatry", "Vascular Surgery"
)
# Step 4D: Filter only the specialties of interest and cap extreme outliers
volume_filtered_df <- valid_scores_df %>%
filter(
Provider_Type %in% top15_specialties,
Composite_Audit_Score <= quantile(Composite_Audit_Score, 0.99, na.rm = TRUE),
Avg_Medicare_Payment <= 1000
)
# Step 4E: Generate plot with fixed x and y axis, no log scale
composite_plot <- ggplot(volume_filtered_df, aes(
x = Avg_Medicare_Payment,
y = Composite_Audit_Score,
color = Audit_Flag
)) +
geom_point(alpha = 0.7) +
facet_wrap(~ Provider_Type, scales = "fixed", nrow = 5) +
scale_color_manual(
values = c(
"Extreme High Risk (98–100%)" = "purple",
"High Risk (92–98%)" = "red",
"Moderate Risk (80–92%)" = "gold",
"Low/No Risk (0–80%)" = "green"
)
) +
scale_x_continuous(
labels = scales::dollar_format(),
breaks = c(0, 250, 500, 750, 1000),
limits = c(0, 1000)
) +
scale_y_continuous(
limits = c(0, 85),
breaks = seq(0, 80, by = 20)
) +
theme_minimal(base_size = 13) +
theme(strip.text = element_text(size = 10)) +
labs(
title = "Composite Audit Score vs. Medicare Payment",
subtitle = "Top 15 Most Interesting Specialties – Risk Segmentation at 80/92/98 Percentiles",
x = "Average Medicare Payment ($)",
y = "Composite Audit Score",
color = "Audit Risk Level"
)
# Step 4F: Save output as PNG
ggsave("Top15_Composite_vs_Payment_BySpecialty_FIXEDAXIS.png",
plot = composite_plot, dpi = 300, width = 14, height = 9)
# Step 4G: Render in output
composite_plot
# Step 5A: Join Provider_Last_Name from filtered_data, and rename it cleanly
scored_named <- scored_providers %>%
left_join(
filtered_data %>%
mutate(NPI = as.character(NPI)) %>%
select(NPI, Provider_Last_Name),
by = "NPI"
) %>%
rename(Provider_Last_Name = Provider_Last_Name.y) # in case it became .y
# Step 5B: Top 50 providers overall
top50_overall <- scored_named %>%
distinct(NPI, .keep_all = TRUE) %>%
arrange(desc(Composite_Audit_Score)) %>%
slice_head(n = 50)
# Display Top 50 Table
knitr::kable(
top50_overall %>%
select(Provider_Last_Name, NPI, Provider_Type, Composite_Audit_Score),
caption = "Top 50 Providers Overall by Composite Audit Score"
)
| Provider_Last_Name | NPI | Provider_Type | Composite_Audit_Score |
|---|---|---|---|
| Quest Diagnostics Clinical Laboratories Inc | 1891731626 | Clinical Laboratory | 435.67 |
| Laboratory Corporation Of America | 1255314704 | Clinical Laboratory | 395.91 |
| Vanvliet | 1841372968 | Critical Care (Intensivists) | 124.67 |
| Pine Brook Pharmacy Llc | 1558596932 | Clinical Laboratory | 97.41 |
| Rojas | 1134383060 | Pathology | 96.04 |
| Mittal | 1992742084 | Pathology | 96.04 |
| Mathew | 1598296097 | Pathology | 96.02 |
| Martens | 1144683467 | Pathology | 95.23 |
| Zhao | 1578603767 | Pathology | 95.21 |
| Patel | 1710368501 | Pathology | 95.07 |
| Lee | 1871628529 | Pathology | 94.93 |
| Miller | 1922111699 | Pathology | 94.85 |
| Robens | 1619116407 | Pathology | 94.82 |
| Mcmullen | 1740203389 | Pathology | 94.82 |
| Sher | 1952757411 | Pathology | 94.78 |
| Ficarra | 1326220062 | Pathology | 94.76 |
| Xu | 1770743593 | Pathology | 94.73 |
| Lewis | 1326070962 | Pathology | 94.68 |
| Barton | 1467431247 | Pathology | 94.68 |
| Lancia | 1700959590 | Pathology | 94.51 |
| Mcpherson | 1538184320 | Pathology | 94.48 |
| Migliozzi | 1013064021 | Pathology | 94.37 |
| Lastarria | 1437241304 | Pathology | 94.29 |
| Angell | 1205870870 | Thoracic Surgery | 92.30 |
| Vogelbaum | 1598732380 | Neurosurgery | 90.57 |
| Etame | 1639292287 | Neurosurgery | 89.56 |
| Haydon | 1669677514 | Neurosurgery | 89.30 |
| Carvallo | 1053416412 | Nephrology | 87.40 |
| Badillo | 1871588350 | Nephrology | 84.57 |
| Abdel Rahman | 1134383615 | Nephrology | 84.54 |
| Rizzo | 1801870407 | Nephrology | 84.33 |
| Toledo-Nieves | 1750838702 | Nephrology | 83.78 |
| Watson Clinic Llp | 1558409730 | Clinical Laboratory | 83.12 |
| Elite Bio Reference Laboratory Llc | 1275292294 | Clinical Laboratory | 82.95 |
| Moodey | 1124231774 | Critical Care (Intensivists) | 81.93 |
| Hematopathology Associates Llc | 1164523478 | Clinical Laboratory | 81.24 |
| International Medical Laboratory Inc | 1376518241 | Clinical Laboratory | 79.01 |
| Angirekula | 1710942206 | Pain Management | 77.26 |
| Campbell | 1548377906 | Critical Care (Intensivists) | 76.90 |
| Baldonado | 1992221444 | Thoracic Surgery | 76.84 |
| Boyle | 1003938366 | Pathology | 76.12 |
| Bossler | 1679554612 | Pathology | 76.12 |
| Qin | 1881787877 | Pathology | 76.12 |
| Saeed-Vafa | 1932433570 | Pathology | 75.87 |
| Bowes Imaging Center Llc. | 1316985658 | Independent Diagnostic Testing Facility (IDTF) | 75.20 |
| Coppola | 1730119454 | Pathology | 73.76 |
| Williams | 1598744393 | Pathology | 73.57 |
| Balsera | 1598773376 | Nephrology | 73.39 |
| Lubin | 1508834706 | Pain Management | 73.04 |
| Bovee | 1427346683 | Ophthalmology | 72.91 |
# Step 5C: Get Top 3 Providers by Specialty without repeating NPIs
top3_by_specialty <- scored_named %>%
filter(!is.na(Composite_Audit_Score)) %>%
group_by(Provider_Type, NPI) %>%
slice_max(order_by = Composite_Audit_Score, n = 1) %>% # get max score per NPI
ungroup() %>%
distinct(NPI, .keep_all = TRUE) %>% # remove any duplicate NPIs
group_by(Provider_Type) %>%
slice_max(order_by = Composite_Audit_Score, n = 3) %>% # now take top 3 per specialty
ungroup()
# Display Top 3 Per Specialty Table
knitr::kable(
top3_by_specialty %>%
select(Provider_Type, Provider_Last_Name, NPI, Composite_Audit_Score),
caption = "Top 3 Providers by Composite Score in Each Specialty"
)
| Provider_Type | Provider_Last_Name | NPI | Composite_Audit_Score |
|---|---|---|---|
| Addiction Medicine | Amen | 1831300995 | 0.98 |
| Advanced Heart Failure and Transplant Cardiology | Kumar | 1538267661 | 16.88 |
| Advanced Heart Failure and Transplant Cardiology | Oliveira | 1699879098 | 14.69 |
| Advanced Heart Failure and Transplant Cardiology | Arroyo | 1811965650 | 12.09 |
| All Other Suppliers | Brashears Vital Care Corp | 1710912662 | 0.70 |
| Allergy/ Immunology | Cho | 1093917304 | 26.84 |
| Allergy/ Immunology | Sher | 1386603108 | 23.84 |
| Allergy/ Immunology | Alshaar | 1669411740 | 17.88 |
| Ambulance Service Provider | City Of Temple Terrace | 1134177389 | 16.41 |
| Ambulance Service Provider | Affordable Transport Inc. | 1225279318 | 15.24 |
| Ambulance Service Provider | Americare Ambulance Service Inc | 1073543021 | 15.21 |
| Ambulatory Surgical Center | Coastal Medical Center Llc | 1538231956 | 41.57 |
| Ambulatory Surgical Center | Citrus Urology Center, Inc. | 1821068289 | 37.57 |
| Ambulatory Surgical Center | Riverwalk Ambulatory Surgery Center Llc | 1588839609 | 34.62 |
| Anesthesiology | Beebe | 1831395896 | 55.49 |
| Anesthesiology | Fitzpatrick | 1750764148 | 54.11 |
| Anesthesiology | Kaiafas | 1386676773 | 53.13 |
| Anesthesiology Assistant | Pierre | 1275925109 | 26.38 |
| Anesthesiology Assistant | Hess | 1013903046 | 24.75 |
| Anesthesiology Assistant | Baltodano | 1972157824 | 22.76 |
| Audiologist | Malkiewicz | 1922475805 | 37.52 |
| Audiologist | Chen | 1053048991 | 33.71 |
| Audiologist | Schenk | 1154592293 | 33.69 |
| Cardiac Surgery | Ezeldin | 1639129133 | 69.60 |
| Cardiac Surgery | Vesco | 1487641098 | 45.96 |
| Cardiac Surgery | Sanabria | 1386698041 | 45.52 |
| Cardiology | Carrillo | 1821092370 | 44.95 |
| Cardiology | Makati | 1467541953 | 44.37 |
| Cardiology | De Azevedo Filho | 1619467529 | 43.95 |
| Centralized Flu | Walgreen Co | 1396750584 | 13.30 |
| Centralized Flu | Walgreen Co | 1174538474 | 13.28 |
| Centralized Flu | Walgreen Co | 1558376731 | 13.27 |
| Certified Clinical Nurse Specialist | Allspaw | 1154380921 | 1.61 |
| Certified Nurse Midwife | Mccormick | 1558366336 | 0.98 |
| Certified Nurse Midwife | Harrington | 1366430449 | 0.93 |
| Certified Nurse Midwife | Dibartolo | 1952812042 | 0.65 |
| Certified Registered Nurse Anesthetist (CRNA) | Alea | 1881662187 | 40.67 |
| Certified Registered Nurse Anesthetist (CRNA) | Alvarodiaz | 1427199165 | 40.55 |
| Certified Registered Nurse Anesthetist (CRNA) | Horowitz | 1891893236 | 40.53 |
| Chiropractic | Bretz | 1124128319 | 0.41 |
| Chiropractic | Paisley | 1730616848 | 0.41 |
| Chiropractic | Tellier | 1750452868 | 0.41 |
| Clinical Cardiac Electrophysiology | Perzanowski-Obregon | 1649229758 | 66.63 |
| Clinical Cardiac Electrophysiology | Yamamura | 1083603989 | 64.86 |
| Clinical Cardiac Electrophysiology | Cardona | 1588626410 | 62.52 |
| Clinical Laboratory | Quest Diagnostics Clinical Laboratories Inc | 1891731626 | 439.31 |
| Clinical Laboratory | Laboratory Corporation Of America | 1255314704 | 402.49 |
| Clinical Laboratory | Pine Brook Pharmacy Llc | 1558596932 | 101.32 |
| Colorectal Surgery (Proctology) | Frattini | 1831367424 | 53.34 |
| Colorectal Surgery (Proctology) | Shore | 1235138983 | 38.26 |
| Colorectal Surgery (Proctology) | Honer | 1801864079 | 36.12 |
| Critical Care (Intensivists) | Vanvliet | 1841372968 | 132.50 |
| Critical Care (Intensivists) | Moodey | 1124231774 | 82.51 |
| Critical Care (Intensivists) | Campbell | 1548377906 | 77.46 |
| Dermatology | Mavropoulos | 1063737682 | 39.93 |
| Dermatology | Miller | 1023007242 | 31.42 |
| Dermatology | Esguerra | 1205825486 | 29.24 |
| Diagnostic Radiology | Dawson | 1033537725 | 41.82 |
| Diagnostic Radiology | Murtagh | 1538142567 | 41.30 |
| Diagnostic Radiology | Clark | 1902864341 | 41.22 |
| Emergency Medicine | Regan | 1891719050 | 52.37 |
| Emergency Medicine | Odome | 1245641240 | 52.10 |
| Emergency Medicine | Ratcliffe | 1952339103 | 50.25 |
| Endocrinology | Antunes | 1720032253 | 35.90 |
| Endocrinology | Piotrowska | 1205151453 | 34.27 |
| Endocrinology | Pinero-Pilona | 1720082068 | 33.52 |
| Family Practice | Davson Sterling | 1861709610 | 29.96 |
| Family Practice | Reddy | 1003377268 | 29.33 |
| Family Practice | Khurana | 1952357816 | 29.29 |
| Gastroenterology | George | 1467804179 | 49.90 |
| Gastroenterology | Khan | 1013928282 | 49.23 |
| Gastroenterology | Castineira | 1760830731 | 48.37 |
| General Practice | Hammarquist | 1033650551 | 61.19 |
| General Practice | Rudelli | 1184260283 | 61.03 |
| General Practice | Perkins | 1700229838 | 47.38 |
| General Surgery | Mitchell | 1033166277 | 77.89 |
| General Surgery | Parrack | 1134369002 | 77.42 |
| General Surgery | Ruan | 1285711275 | 77.24 |
| Geriatric Medicine | Kienle | 1740398676 | 28.90 |
| Geriatric Medicine | Cinelli | 1598793861 | 25.14 |
| Geriatric Medicine | Cherukuri | 1750375267 | 20.23 |
| Gynecological Oncology | Hahn | 1457604738 | 38.59 |
| Gynecological Oncology | South | 1336368547 | 31.40 |
| Gynecological Oncology | Stine | 1720310089 | 31.20 |
| Hand Surgery | Chan | 1114129236 | 38.52 |
| Hand Surgery | Klein | 1295773059 | 35.30 |
| Hand Surgery | Miller | 1154397297 | 34.78 |
| Hematology | Bergier | 1528098720 | 50.41 |
| Hematology | Giglia | 1417004201 | 18.99 |
| Hematology | Muhammad | 1942242938 | 18.24 |
| Hematology-Oncology | Maun | 1518940246 | 22.85 |
| Hematology-Oncology | Mulaparthi | 1427048537 | 20.60 |
| Hematology-Oncology | Harandi | 1003893199 | 20.40 |
| Hematology-Oncology | Gonter | 1295726933 | 20.40 |
| Hematopoietic Cell Transplantation and Cellular Therapy | Hansen | 1881958270 | 8.39 |
| Hematopoietic Cell Transplantation and Cellular Therapy | Jain | 1982139655 | 8.21 |
| Hematopoietic Cell Transplantation and Cellular Therapy | Mishra | 1225225865 | 8.19 |
| Home Infusion Therapy Services | Coram Healthcare Corporation Of Florida | 1417907627 | 0.29 |
| Hospice and Palliative Care | Suggs | 1497710172 | 66.20 |
| Hospice and Palliative Care | Uzabel | 1427019918 | 58.77 |
| Hospice and Palliative Care | Akkineni | 1205099744 | 57.31 |
| Hospitalist | Patel | 1427126036 | 61.49 |
| Hospitalist | Garcia | 1639188618 | 61.49 |
| Hospitalist | Chandler | 1487041620 | 61.47 |
| Independent Diagnostic Testing Facility (IDTF) | Bowes Imaging Center Llc. | 1316985658 | 77.65 |
| Independent Diagnostic Testing Facility (IDTF) | Diagnostic Medical Testing Inc | 1083672208 | 63.97 |
| Independent Diagnostic Testing Facility (IDTF) | Bowes Imaging Center, Llc | 1104198258 | 43.01 |
| Infectious Disease | Schreibman | 1609877190 | 35.06 |
| Infectious Disease | Cancel | 1740246644 | 34.90 |
| Infectious Disease | Nguyen | 1811159262 | 23.63 |
| Internal Medicine | Kadari | 1760046171 | 44.90 |
| Internal Medicine | Vyas | 1447690987 | 44.25 |
| Internal Medicine | Kanapathippillai | 1952530420 | 43.87 |
| Interventional Cardiology | Sivasankaran | 1265520043 | 26.91 |
| Interventional Cardiology | Khant | 1811959307 | 25.53 |
| Interventional Cardiology | Srivastava | 1932209608 | 24.10 |
| Interventional Pain Management | Jassal | 1255625513 | 32.64 |
| Interventional Pain Management | Ramos | 1831150630 | 32.34 |
| Interventional Pain Management | Samlaska | 1548291172 | 30.62 |
| Interventional Radiology | Nieves-Cruz | 1174703375 | 56.46 |
| Interventional Radiology | Niedzwiecki | 1578632147 | 47.87 |
| Interventional Radiology | Raymond | 1770856999 | 45.22 |
| Licensed Clinical Social Worker | Mcdaniel | 1144729427 | 1.07 |
| Licensed Clinical Social Worker | Thekkethottiyil | 1023376894 | 1.06 |
| Licensed Clinical Social Worker | Rosendale | 1366747115 | 1.06 |
| Licensed Clinical Social Worker | Cunningham | 1457494270 | 1.06 |
| Licensed Clinical Social Worker | Westfield | 1598070187 | 1.06 |
| Licensed Clinical Social Worker | Goodhand | 1902128879 | 1.06 |
| Licensed Clinical Social Worker | Eakes | 1912477712 | 1.06 |
| Licensed Clinical Social Worker | Yewell | 1922517556 | 1.06 |
| Licensed Clinical Social Worker | Johnston | 1942469648 | 1.06 |
| Mammography Center | Adventhealth West Florida Imaging Inc | 1316581234 | 1.25 |
| Mass Immunizer Roster Biller | Jahd Inc | 1275070716 | 18.37 |
| Mass Immunizer Roster Biller | Samson Merger Sub, Llc | 1932521358 | 18.37 |
| Mass Immunizer Roster Biller | Samson Merger Sub, Llc | 1700208121 | 18.35 |
| Maxillofacial Surgery | Chuong | 1558369678 | 15.34 |
| Maxillofacial Surgery | Hothersall | 1902912355 | 6.05 |
| Medical Oncology | Audeh | 1104817063 | 47.45 |
| Medical Oncology | Berry | 1669463378 | 42.28 |
| Medical Oncology | Lunin | 1003807645 | 41.86 |
| Micrographic Dermatologic Surgery | Worley | 1457823809 | 26.52 |
| Micrographic Dermatologic Surgery | Epstein | 1356588974 | 26.47 |
| Micrographic Dermatologic Surgery | Ewanowski | 1780880476 | 24.19 |
| Nephrology | Carvallo | 1053416412 | 87.40 |
| Nephrology | Badillo | 1871588350 | 84.57 |
| Nephrology | Abdel Rahman | 1134383615 | 84.54 |
| Neurology | Aung-Din | 1215987912 | 75.46 |
| Neurology | Reddy | 1952368433 | 70.66 |
| Neurology | Malka | 1235176322 | 64.77 |
| Neuropsychiatry | Bazargan | 1265461941 | 1.35 |
| Neurosurgery | Vogelbaum | 1598732380 | 96.63 |
| Neurosurgery | Haydon | 1669677514 | 96.25 |
| Neurosurgery | Etame | 1639292287 | 96.00 |
| Nuclear Medicine | Carter | 1659582286 | 21.00 |
| Nuclear Medicine | Miliziano | 1336154608 | 20.71 |
| Nuclear Medicine | Taher | 1386829984 | 18.87 |
| Nurse Practitioner | Rexroth | 1992921977 | 63.86 |
| Nurse Practitioner | Hawkins | 1427619329 | 63.66 |
| Nurse Practitioner | Hazelwood | 1851394274 | 63.46 |
| Obstetrics & Gynecology | Cox | 1023021789 | 41.23 |
| Obstetrics & Gynecology | Barreiro | 1093865743 | 41.23 |
| Obstetrics & Gynecology | Wahba | 1124178116 | 41.23 |
| Obstetrics & Gynecology | Mersch | 1154858322 | 41.23 |
| Obstetrics & Gynecology | Michael | 1508155904 | 41.23 |
| Obstetrics & Gynecology | Watkins | 1528054509 | 41.23 |
| Obstetrics & Gynecology | Hardy-Hunter | 1598940074 | 41.23 |
| Obstetrics & Gynecology | Calderon | 1669478939 | 41.23 |
| Obstetrics & Gynecology | Mcneill | 1740548700 | 41.23 |
| Obstetrics & Gynecology | Miller | 1790771731 | 41.23 |
| Obstetrics & Gynecology | Curtis | 1821402868 | 41.23 |
| Obstetrics & Gynecology | Gomez | 1851522825 | 41.23 |
| Obstetrics & Gynecology | Vonthron | 1881680825 | 41.23 |
| Obstetrics & Gynecology | Weiss | 1922094986 | 41.23 |
| Occupational Therapist in Private Practice | Martin | 1285759555 | 15.75 |
| Occupational Therapist in Private Practice | Koesling | 1295388635 | 15.73 |
| Occupational Therapist in Private Practice | Finley | 1730707035 | 15.72 |
| Ophthalmology | Bovee | 1427346683 | 77.31 |
| Ophthalmology | Martino | 1073702031 | 68.45 |
| Ophthalmology | Levitt | 1720074974 | 63.51 |
| Opioid Treatment Program | Operation Par Inc | 1477611903 | 2.08 |
| Opioid Treatment Program | Maps Sarasota | 1477810638 | 2.08 |
| Opioid Treatment Program | Operation Par Inc | 1639237001 | 2.08 |
| Opioid Treatment Program | Operation Par Inc | 1972661411 | 2.08 |
| Optometry | Johnson | 1114925773 | 11.66 |
| Optometry | Buffano | 1275765265 | 11.15 |
| Optometry | Sorkin | 1902939010 | 10.71 |
| Oral Surgery (Dentist only) | Moore | 1164564951 | 17.42 |
| Oral Surgery (Dentist only) | Vorwald | 1689961385 | 13.41 |
| Oral Surgery (Dentist only) | Banachowski | 1033484944 | 7.14 |
| Orthopedic Surgery | Cronin | 1821479759 | 63.01 |
| Orthopedic Surgery | Hess | 1578568291 | 60.50 |
| Orthopedic Surgery | Klima | 1518103316 | 57.50 |
| Osteopathic Manipulative Medicine | Von Lindeman | 1629146816 | 35.99 |
| Osteopathic Manipulative Medicine | Jones | 1184129983 | 29.75 |
| Osteopathic Manipulative Medicine | Miranda | 1225202575 | 29.48 |
| Otolaryngology | Parasher | 1962795781 | 27.32 |
| Otolaryngology | Pfaff | 1952664609 | 26.86 |
| Otolaryngology | Byers | 1710935911 | 25.93 |
| Pain Management | Angirekula | 1710942206 | 78.86 |
| Pain Management | Latif | 1568434868 | 75.47 |
| Pain Management | Lubin | 1508834706 | 73.04 |
| Pathology | Migliozzi | 1013064021 | 98.56 |
| Pathology | Rojas | 1134383060 | 96.04 |
| Pathology | Mittal | 1992742084 | 96.04 |
| Pediatric Medicine | Rucker | 1194721522 | 35.30 |
| Pediatric Medicine | Patel | 1770005902 | 34.21 |
| Pediatric Medicine | Mesghali Morales | 1336362706 | 31.03 |
| Pharmacy | Maa Bhawani Inc | 1003104779 | 20.16 |
| Pharmacy | Balls Rexall Drugs Inc | 1801908736 | 18.55 |
| Pharmacy | Maahi Llc | 1184024788 | 18.08 |
| Physical Medicine and Rehabilitation | Genato | 1366468282 | 42.94 |
| Physical Medicine and Rehabilitation | Razvi | 1679859532 | 41.98 |
| Physical Medicine and Rehabilitation | Lockett | 1356410443 | 33.95 |
| Physical Therapist in Private Practice | Hauskey | 1578997052 | 1.73 |
| Physical Therapist in Private Practice | Thomas | 1932457330 | 1.06 |
| Physical Therapist in Private Practice | De La Torre | 1336516772 | 0.98 |
| Physician Assistant | Elomina | 1831811090 | 72.75 |
| Physician Assistant | Daignault | 1528652120 | 72.32 |
| Physician Assistant | Radhakrishnan | 1265150890 | 72.31 |
| Plastic and Reconstructive Surgery | Shulman | 1972690329 | 60.64 |
| Plastic and Reconstructive Surgery | Dayicioglu | 1164674081 | 58.15 |
| Plastic and Reconstructive Surgery | Smith | 1891707915 | 45.95 |
| Podiatry | Marsh | 1093722548 | 18.45 |
| Podiatry | Williams | 1477530905 | 17.64 |
| Podiatry | Zelna | 1376885517 | 16.55 |
| Portable X-Ray Supplier | Radiance Radiology Inc | 1134459720 | 13.16 |
| Portable X-Ray Supplier | Florida Imaging Specialists, Pa | 1790328185 | 9.41 |
| Portable X-Ray Supplier | Professional Portable Radiologic Services Inc. | 1033780051 | 9.02 |
| Preventive Medicine | Senatorov | 1679881809 | 38.96 |
| Preventive Medicine | Patel | 1750720025 | 24.10 |
| Preventive Medicine | Deveau | 1528301132 | 19.89 |
| Psychiatry | Qureshi | 1497166656 | 63.87 |
| Psychiatry | Shahid | 1598184145 | 63.83 |
| Psychiatry | Patel | 1205216058 | 62.59 |
| Psychologist, Clinical | Cortman | 1629160775 | 22.90 |
| Psychologist, Clinical | Todoroff | 1033129804 | 19.18 |
| Psychologist, Clinical | Simpson | 1194779470 | 18.70 |
| Pulmonary Disease | Plum | 1770902835 | 44.72 |
| Pulmonary Disease | Miller | 1790050979 | 33.69 |
| Pulmonary Disease | Cheing | 1750701652 | 32.69 |
| Radiation Oncology | Yu | 1740386358 | 45.15 |
| Radiation Oncology | Oliver | 1053707521 | 44.96 |
| Radiation Oncology | Scott | 1679734370 | 36.06 |
| Registered Dietitian or Nutrition Professional | Grillo | 1538678347 | 0.92 |
| Registered Dietitian or Nutrition Professional | Solberger | 1740882141 | 0.35 |
| Registered Dietitian or Nutrition Professional | Riley | 1033153788 | 0.31 |
| Registered Dietitian or Nutrition Professional | Timmons | 1265593644 | 0.31 |
| Registered Dietitian or Nutrition Professional | Harbison | 1407832462 | 0.31 |
| Registered Dietitian or Nutrition Professional | Dominguez | 1568128676 | 0.31 |
| Registered Dietitian or Nutrition Professional | Figueroa | 1629591672 | 0.31 |
| Registered Dietitian or Nutrition Professional | Ulm | 1780915116 | 0.31 |
| Registered Dietitian or Nutrition Professional | Gilpin | 1871163949 | 0.31 |
| Rheumatology | Johnston | 1932332384 | 43.20 |
| Rheumatology | Drucker | 1790760783 | 41.42 |
| Rheumatology | Zito | 1407992597 | 34.47 |
| Sleep Medicine | Agarwal | 1932497773 | 16.46 |
| Sleep Medicine | Anderson | 1699783845 | 11.23 |
| Sleep Medicine | Guzman | 1881198547 | 9.05 |
| Speech Language Pathologist | Mccord | 1194072017 | 37.84 |
| Speech Language Pathologist | Thornberry | 1831822717 | 37.84 |
| Speech Language Pathologist | Rejonis | 1720361595 | 37.82 |
| Sports Medicine | Yi | 1235367483 | 66.23 |
| Sports Medicine | Gorman | 1659507507 | 59.98 |
| Sports Medicine | Dhani | 1366949935 | 48.54 |
| Surgical Oncology | Moskal | 1619949831 | 36.40 |
| Surgical Oncology | Meredith | 1659329217 | 35.87 |
| Surgical Oncology | Laronga | 1659398402 | 31.37 |
| Thoracic Surgery | Angell | 1205870870 | 92.69 |
| Thoracic Surgery | Baldonado | 1992221444 | 84.54 |
| Thoracic Surgery | Lambert | 1558369454 | 73.18 |
| Undefined Physician type | Kennon | 1053675868 | 66.37 |
| Undefined Physician type | Sweeney | 1215103775 | 17.58 |
| Undefined Physician type | Hunt | 1871579946 | 13.70 |
| Undersea and Hyperbaric Medicine | Latimer-Pierson | 1740431469 | 10.88 |
| Undersea and Hyperbaric Medicine | Valdes | 1912958588 | 6.29 |
| Urology | Graves | 1164684239 | 29.27 |
| Urology | Everett | 1780039743 | 23.15 |
| Urology | Szell | 1124316088 | 23.05 |
| Vascular Surgery | Nedurian | 1417942681 | 53.24 |
| Vascular Surgery | Davis | 1023335601 | 48.17 |
| Vascular Surgery | Purcell | 1144587841 | 46.27 |
The primary goal of this analysis is to analyze Medicare provider data using k-means clustering, assign meaningful cluster labels to providers based on cost and service volume, and visualize the geographic distribution of these clusters across ZIP codes in Florida using an interactive map.
# ---- Step 3: K-Means Clustering ----
# K-Means Clustering
df_raw <- filtered_data %>%
mutate(
across(
.cols = c(Total_Services, Avg_Submitted_Charge, Avg_Medicare_Payment),
.fns = ~as.numeric(as.character(.))
)
) %>%
select(NPI, Total_Services, Avg_Submitted_Charge, Avg_Medicare_Payment) %>%
na.omit()
stopifnot(all(sapply(df_raw, is.numeric)))
df_scaled <- scale(df_raw)
k <- min(4, nrow(unique(df_scaled)))
if (k < 2) stop("Not enough data to form clusters.")
set.seed(123)
clusters <- kmeans(df_scaled, centers = k, nstart = 25)
cluster_labels <- c(
"1" = "Moderate Cost, High Volume",
"2" = "Typical Providers",
"3" = "High Cost, Low Volume",
"4" = "Low Cost, Very High Volume"
)
results <- df_raw %>%
mutate(
Cluster = clusters$cluster,
Cluster_Label = cluster_labels[as.character(Cluster)]
)
# Mapping
zcta_shapes <- tigris::zctas(cb = TRUE, year = 2020) %>%
rename(Zip = ZCTA5CE20) %>%
mutate(Zip = sprintf("%05s", as.character(Zip)))
# Create NPI to ZIP mapping necessary for join
npi_zip_map <- filtered_data %>%
mutate(NPI = as.character(NPI)) %>%
select(NPI, Provider_Zip) %>%
distinct(NPI, .keep_all = TRUE)
zip_cluster_data <- results %>%
mutate(NPI = as.character(NPI)) %>%
left_join(npi_zip_map, by = "NPI") %>%
group_by(Provider_Zip, Cluster_Label) %>%
summarise(Count = n(), .groups = "drop")
zip_cluster_sf <- left_join(zcta_shapes,
zip_cluster_data %>%
mutate(Provider_Zip = sprintf("%05s", as.character(Provider_Zip))),
by = c("Zip" = "Provider_Zip")) %>%
filter(!is.na(Cluster_Label)) %>%
st_make_valid() %>%
filter(st_geometry_type(geometry) %in% c("POLYGON", "MULTIPOLYGON")) %>%
st_transform(4326)
zip_cluster_sf <- zip_cluster_sf %>%
mutate(
fill_color = dplyr::recode(Cluster_Label,
"Moderate Cost, High Volume" = "#1f78b4",
"Typical Providers" = "#33a02c",
"High Cost, Low Volume" = "#e31a1c",
"Low Cost, Very High Volume" = "#ff7f00",
.default = "#cccccc"
),
opacity = scales::rescale(Count, to = c(0.4, 1))
)
# Interactive Leaflet Map
leaflet_map <- leaflet::leaflet(zip_cluster_sf) %>%
leaflet::addProviderTiles(leaflet::providers$CartoDB.Positron) %>%
leaflet::addPolygons(
fillColor = ~fill_color,
fillOpacity = ~opacity,
color = "#444444",
weight = 0.5,
popup = ~paste0(
"<b>ZIP:</b> ", Zip, "<br>",
"<b>Cluster:</b> ", Cluster_Label, "<br>",
"<b>Provider Count:</b> ", Count
)
) %>%
leaflet::addLegend(
"bottomright",
colors = c("#1f78b4", "#33a02c", "#e31a1c", "#ff7f00"),
labels = c(
"Moderate Cost, High Volume",
"Typical Providers",
"High Cost, Low Volume",
"Low Cost, Very High Volume"
),
title = "Cluster Type",
opacity = 1
)
htmlwidgets::saveWidget(leaflet_map, "Output/Medicare_Clusters_FL.html", selfcontained = TRUE)
leaflet_map
According to the provider statistics, Cluster 2 is the most common, accounting for the majority of providers, while Cluster 3 is under-represented, implying potential clustering imbalances. Cluster 2 has a wide range of values for total services, average submitted charge, and average Medicare payment, with average Medicare payments at $70.23, average submitted charges at $356.84, and an average of 604 total services; however, Cluster 3 has higher costs and lower service volumes, with average Medicare payments at $425.36, average submitted charges at $4,490.86, and an average of 29 total services. These findings indicate that the clustering technique should be reconsidered, possibly using new algorithms or feature engineering, as well as a more rigorous outlier analysis within Cluster 2 to assess whether some providers should be reclassified in order to establish more distinct and balanced groups.
The primary goal of this methodology is to identify and understand factors influencing average Medicare payments.
# Load and clean with basic numeric conversion
df_lm <- filtered_data %>%
select(
Avg_Medicare_Payment,
Total_Services,
Avg_Submitted_Charge,
Total_Beneficiaries,
Place_Of_Service,
Provider_Type,
RUCA_Description
) %>%
mutate(
Medicare_Payment = parse_number(as.character(Avg_Medicare_Payment)),
Services = parse_number(as.character(Total_Services)),
Submitted_Charge = parse_number(as.character(Avg_Submitted_Charge)),
Beneficiaries = parse_number(as.character(Total_Beneficiaries)),
Place_Of_Service = factor(Place_Of_Service),
Specialty = factor(Provider_Type),
RUCA_Description = factor(RUCA_Description)
) %>%
select(-Avg_Medicare_Payment, -Total_Services, -Avg_Submitted_Charge, -Total_Beneficiaries, -Provider_Type) %>%
filter(Medicare_Payment > 1.00) %>%
mutate(
Log_Medicare_Payment = log(Medicare_Payment)
) %>%
filter(is.finite(Log_Medicare_Payment)) %>%
mutate(
across(c(Services, Submitted_Charge, Beneficiaries),
list(z = ~scale(.)),
.names = "{.col}_z")
) %>%
group_by(Specialty) %>%
filter(n() >= 50) %>%
ungroup %>%
drop_na() %>%
select(
Log_Medicare_Payment,
Services_z, Submitted_Charge_z, Beneficiaries_z,
Place_Of_Service, RUCA_Description,
Specialty
)
# Run simplified regression (removed HCPCS_Description)
mixed_model <- lmer(
Log_Medicare_Payment ~ Services_z + Submitted_Charge_z + Beneficiaries_z +
Place_Of_Service + RUCA_Description +
(1 | Specialty),
data = df_lm,
)
# Raw Model Summary
summary(mixed_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## Log_Medicare_Payment ~ Services_z + Submitted_Charge_z + Beneficiaries_z +
## Place_Of_Service + RUCA_Description + (1 | Specialty)
## Data: df_lm
##
## REML criterion at convergence: 483082.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -14.3763 -0.4859 0.1073 0.6289 6.4610
##
## Random effects:
## Groups Name Variance Std.Dev.
## Specialty (Intercept) 0.1946 0.4412
## Residual 1.0409 1.0203
## Number of obs: 167701, groups: Specialty, 76
##
## Fixed effects:
## Estimate
## (Intercept) 4.174173
## Services_z -0.021389
## Submitted_Charge_z 0.337635
## Beneficiaries_z -0.002353
## Place_Of_ServiceO -0.221650
## RUCA_DescriptionMetropolitan area high commuting: primary flow 30% or more to a urbanized area of 50,000 and greater -0.291418
## RUCA_DescriptionMetropolitan area low commuting: primary flow 10% to <30% to a urbanized area of 50,000 and greater 0.114430
## RUCA_DescriptionMicropolitan area core: primary flow within an urban cluster of 10,000 to 49,999 -0.084599
## RUCA_DescriptionSecondary flow 30% to <50% to a larger urbanized area of 50,000 and greater 0.047710
## RUCA_DescriptionSmall town core: primary flow within an urban cluster of 2,500 to 9,999 -0.212993
## Std. Error
## (Intercept) 0.051182
## Services_z 0.003050
## Submitted_Charge_z 0.002778
## Beneficiaries_z 0.003073
## Place_Of_ServiceO 0.006479
## RUCA_DescriptionMetropolitan area high commuting: primary flow 30% or more to a urbanized area of 50,000 and greater 0.017488
## RUCA_DescriptionMetropolitan area low commuting: primary flow 10% to <30% to a urbanized area of 50,000 and greater 0.067628
## RUCA_DescriptionMicropolitan area core: primary flow within an urban cluster of 10,000 to 49,999 0.087376
## RUCA_DescriptionSecondary flow 30% to <50% to a larger urbanized area of 50,000 and greater 0.011288
## RUCA_DescriptionSmall town core: primary flow within an urban cluster of 2,500 to 9,999 0.242755
## t value
## (Intercept) 81.556
## Services_z -7.014
## Submitted_Charge_z 121.520
## Beneficiaries_z -0.766
## Place_Of_ServiceO -34.210
## RUCA_DescriptionMetropolitan area high commuting: primary flow 30% or more to a urbanized area of 50,000 and greater -16.664
## RUCA_DescriptionMetropolitan area low commuting: primary flow 10% to <30% to a urbanized area of 50,000 and greater 1.692
## RUCA_DescriptionMicropolitan area core: primary flow within an urban cluster of 10,000 to 49,999 -0.968
## RUCA_DescriptionSecondary flow 30% to <50% to a larger urbanized area of 50,000 and greater 4.227
## RUCA_DescriptionSmall town core: primary flow within an urban cluster of 2,500 to 9,999 -0.877
##
## Correlation of Fixed Effects:
## (Intr) Srvcs_ Sbm_C_ Bnfcr_ P_O_SO Rahcpf3omtauao5ag
## Services_z -0.001
## Sbmttd_Chr_ -0.009 0.002
## Beneficrs_z 0.001 -0.572 0.001
## Plc_Of_SrvO -0.085 -0.001 0.067 -0.001
## Rahcpf3omtauao5ag -0.002 0.000 0.000 -0.002 -0.031
## Ralcpf1t<tauao5ag -0.001 0.002 -0.002 -0.001 -0.009 0.007
## Racpfwauco1t4 -0.002 0.001 0.000 0.000 0.000 0.005
## Rf3t<taluao5ag -0.007 0.000 0.000 0.001 -0.026 0.036
## Rtcpfwauco2t9 -0.013 0.000 0.003 0.000 -0.008 0.001
## Ralcpf1t<tauao5ag Racpfwauco1t4 Rf3t<taluao5ag
## Services_z
## Sbmttd_Chr_
## Beneficrs_z
## Plc_Of_SrvO
## Rahcpf3omtauao5ag
## Ralcpf1t<tauao5ag
## Racpfwauco1t4 0.002
## Rf3t<taluao5ag 0.010 0.008
## Rtcpfwauco2t9 0.000 0.000 0.001
# Fixed Effects Summary
knitr::kable(tidy(mixed_model, effects = "fixed", conf.int = TRUE), digits =3)
| effect | term | estimate | std.error | statistic | conf.low | conf.high |
|---|---|---|---|---|---|---|
| fixed | (Intercept) | 4.174 | 0.051 | 81.556 | 4.074 | 4.274 |
| fixed | Services_z | -0.021 | 0.003 | -7.014 | -0.027 | -0.015 |
| fixed | Submitted_Charge_z | 0.338 | 0.003 | 121.520 | 0.332 | 0.343 |
| fixed | Beneficiaries_z | -0.002 | 0.003 | -0.766 | -0.008 | 0.004 |
| fixed | Place_Of_ServiceO | -0.222 | 0.006 | -34.210 | -0.234 | -0.209 |
| fixed | RUCA_DescriptionMetropolitan area high commuting: primary flow 30% or more to a urbanized area of 50,000 and greater | -0.291 | 0.017 | -16.664 | -0.326 | -0.257 |
| fixed | RUCA_DescriptionMetropolitan area low commuting: primary flow 10% to <30% to a urbanized area of 50,000 and greater | 0.114 | 0.068 | 1.692 | -0.018 | 0.247 |
| fixed | RUCA_DescriptionMicropolitan area core: primary flow within an urban cluster of 10,000 to 49,999 | -0.085 | 0.087 | -0.968 | -0.256 | 0.087 |
| fixed | RUCA_DescriptionSecondary flow 30% to <50% to a larger urbanized area of 50,000 and greater | 0.048 | 0.011 | 4.227 | 0.026 | 0.070 |
| fixed | RUCA_DescriptionSmall town core: primary flow within an urban cluster of 2,500 to 9,999 | -0.213 | 0.243 | -0.877 | -0.689 | 0.263 |
# Random Effects Summary
knitr::kable(tidy(mixed_model, effects = "ran_pars", scales = "vcov"), digits = 3)
| effect | group | term | estimate |
|---|---|---|---|
| ran_pars | Specialty | var__(Intercept) | 0.195 |
| ran_pars | Residual | var__Observation | 1.041 |
# Key fixed effect drivers
fixed_effects <- tidy(mixed_model,
effects = "fixed",
conf.int = TRUE)
plot_data_fixed <- fixed_effects %>%
filter(term %in% c("Submitted_Charge_z", "Place_Of_ServiceO")) %>%
mutate(
# Create clearer names for the plot axis
term_cleaned = case_when(
term == "Submitted_Charge_z" ~ "Submitted Charge Impact",
term == "Place_Of_ServiceO" ~ "Office vs. Facility Impact",
TRUE ~ term
)
)
plot_key_drivers <- ggplot(plot_data_fixed, aes(x = estimate, y = reorder(term_cleaned, estimate))) +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey70") +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.15, linewidth = 0.7) +
geom_point(size = 3.5, color = "dodgerblue") +
labs(
title = "Key Factors Influencing Average Medicare Payment",
subtitle = "Estimated Impact (Positive or Negative)",
x = "Estimated Impact on Medicare Payment",
y = "Factor"
) +
theme_minimal(base_size = 14) +
theme(panel.grid.major.y = element_blank())
ggsave("Output/plot_key_drivers_opaque.png",
plot = plot_key_drivers,
width = 8,
height = 4,
bg = "white"
)
ggsave("Output/plot_key_drivers_transparent.png",
plot = plot_key_drivers,
width = 8,
height = 4
)
# Display the plot
plot_key_drivers
# Variation between specialties
specialty_effects <- ranef(mixed_model,
condVar = FALSE)$Specialty %>%
rename(Intercept_Deviation = `(Intercept)`) %>%
tibble::rownames_to_column("Specialty")
plot_specialty_variation <- ggplot(specialty_effects, aes(x = Intercept_Deviation, y = reorder(Specialty, Intercept_Deviation))) +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey70") +
geom_point(size = 2, color = "forestgreen") +
labs(
title = "Variation in Baseline Payments Across Specialties",
subtitle = "Deviation from Average Baseline Medicare Payment",
x = "Estimated Deviation from Average Baseline (Medicare Payment)",
y = "Medical Specialty"
) +
theme_minimal(base_size = 14) +
theme(
axis.text.y = element_text(size = 8),
panel.grid.major.y = element_blank()
)
ggsave(
"Output/plot_specialty_variation_opaque.png",
plot = plot_specialty_variation,
width = 12,
height = 15,
units = "in",
dpi = 300,
bg = "white"
)
ggsave(
"Output/plot_specialty_variation_transparent.png",
plot = plot_specialty_variation,
width = 12,
height = 15,
units = "in",
dpi = 300
)
# Display the plot
plot_specialty_variation
## 📄 Summary of Linear Regression Findings After analyzing 170,000
Medicare claims across the Tampa Bay area, this statistical model
identifies key payment drivers. It uses a linear mixed-effects
methodology that is a subset of linear regression to find differences
between 76 different provider types while accounting for average charge
amounts.
The primary goal of the heat map visualization was to see at a glance average Medicare costs in the Tampa Bay region.
# File paths
shapefile_path <- "Data/tl_2010_12_zcta510.shp"
# Load Medicare data
medicare_cleaned <- filtered_data %>%
mutate(
Provider_Zip = str_pad(substr(as.character(Provider_Zip), 1, 5), 5, pad = "0"),
Avg_Medicare_Payment = as.numeric(gsub("[$,]", "", Avg_Medicare_Payment))
)
# Load shapefile
zcta <- st_read(shapefile_path) %>%
mutate(ZCTA5CE10 = as.character(ZCTA5CE10))
## Reading layer `tl_2010_12_zcta510' from data source
## `C:\Users\scott\Documents\School\LIS4761\Final\LIS4761_Final\CMS Final Project\Data\tl_2010_12_zcta510.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 983 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -87.62589 ymin: 24.51748 xmax: -80.02711 ymax: 31.00097
## Geodetic CRS: NAD83
# Choose specialty
selected_specialty <- "Internal Medicine"
# Filter and summarize
hm_filtered_data <- medicare_cleaned %>%
filter(Provider_Type == selected_specialty) %>%
group_by(Provider_Zip) %>%
summarize(
Avg_Payment = mean(Avg_Medicare_Payment, na.rm = TRUE),
Providers = n(),
.groups = "drop"
)
# Merge with shapefile
merged_data <- zcta %>%
left_join(hm_filtered_data, by = c("ZCTA5CE10" = "Provider_Zip"))
# Filter with Tampa Bay ZIP codes
tampa_map_data <- merged_data %>%
filter(ZCTA5CE10 %in% tampa_bay_zips)
# Create heat map
tampa_plot <- ggplot(data = tampa_map_data) +
geom_sf(aes(fill = Avg_Payment), color = "white", size = 0.2) +
scale_fill_viridis_c(
option = "plasma",
na.value = "grey90",
name = "Avg Payment ($)",
limits = c(0, 150),
breaks = c(0, 50, 100, 150),
oob = scales::squish
) +
labs(
title = "Medicare Average Payment by ZIP Code in Tampa Bay",
subtitle = paste("Specialty:", selected_specialty),
) +
theme_minimal(base_size = 14) +
theme(
legend.position = "right",
plot.title = element_text(face = "bold", size = 16),
plot.subtitle = element_text(size = 13),
plot.caption = element_text(size = 10, face = "italic")
)
# Print map
tampa_plot
# Save the heat map to the output folder
ggsave(
filename = "Output/heatmap_transparent.png",
plot = tampa_plot,
width = 10,
height = 6,
dpi = 300
)
ggsave(
filename = "Output/heatmap_opaque.png",
plot = tampa_plot,
width = 10,
height = 6,
dpi = 300,
bg = "white"
)
# Display the plot
tampa_plot
## 📄 Summary of Heat Map Visualization Findings Here is a summary of
our analysis based on Medicare payment data visualization:
We analyzed average Medicare payments for providers specializing in Internal Medicine across Tampa Bay ZIP codes. This process began by loading and preparing a cleaned Medicare dataset and a shapefile representing Florida ZIP Code Tabulation Areas (ZCTAs).
After filtering the data for the selected specialty (“Internal Medicine”), we calculated the average Medicare payment and the number of providers per ZIP code. This summarized dataset was then merged with the shapefile data to enable geographic plotting.
Using ggplot2 and the sf package, we created a choropleth map showing the distribution of average Medicare payments across Tampa Bay. Areas with higher average payments appear in brighter plasma shades, while areas with lower payments or no data are shown in muted tones.
The heatmap helps reveal regional disparities in Medicare reimbursements, highlighting ZIP codes with notably higher or lower average payments. These visual insights can inform further investigation into geographic trends in healthcare spending, potential inequities in reimbursement rates, or gaps in provider coverage for Internal Medicine in Tampa Bay.